Import libraries

Note: run intall.packages() before loading the packages.

# general data maniputlation: summarise, filter, etc.
# install.packages("dplyr")
library(dplyr)

#install.packages("plyr")
library(plyr)

# manipulation of date/time data
#install.packages("chron")
library(chron)

# interactive plots
#install.packages("scatterD3")
library(scatterD3)

#install.packages("plotly")
library(plotly)


#install.packages("RCurl")
library(RCurl)

Inspect Score Data

lines <- readLines(textConnection(getURL("https://raw.githubusercontent.com/dustinvanstee/nba-rt-prediction/master/scores_nba.test.dat")))

head(lines)
## [1] "2016-04-05,15:06:16,Phoenix,0,Atlanta,0,(8:00 PM ET),48.0,400829044"          "2016-04-05,15:06:16,Chicago,0,Memphis,0,(8:00 PM ET),48.0,400829045"          "2016-04-05,15:06:16,Cleveland,0,Milwaukee,0,(8:00 PM ET),48.0,400829046"      "2016-04-05,15:06:16,Oklahoma City,0,Denver,0,(9:00 PM ET),48.0,400829047"     "2016-04-05,15:06:16,New Orleans,0,Philadelphia,0,(7:00 PM ET),48.0,400829041" "2016-04-05,15:06:16,Detroit,0,Miami,0,(8:00 PM ET),48.0,400829042"

Load in NBA Score Data Sets

# turn the .dat file to dataframe
nba_scores_DF <- as.data.frame(do.call(rbind, strsplit(lines, ",")), stringsAsFactors=FALSE)
## Warning in (function (..., deparse.level = 1) : number of columns of result is not a multiple of vector length (arg 14628)
# Since I don't have a header in the data set, I want to specify the column metadata
colnames(nba_scores_DF) <- c("dateOrig","ts","teamlonga", "scorea", "teamlongb", "scoreb", "timestring", "timeleft", "gameid")
nba_scores_DF2 <- transform(nba_scores_DF, 
                            dateOrig = as.Date(dateOrig),
                            ts = as.character(ts),
                            teamlonga = as.character(teamlonga),
                            scorea = as.numeric (scorea),
                            teamlongb = as.character(teamlongb),
                            scoreb = as.numeric (scoreb),
                            timestring = as.character(timestring),
                            timeleft = as.numeric(timeleft),
                            gameid = as.character(gameid))
## Warning: NAs introduced by coercion
## Warning: NAs introduced by coercion

## Warning: NAs introduced by coercion

Inspect Historical score data

This data is the raw input that contains a record for each update of the game. Data has some errors and redundancies that must be removed. Will discuss that as we go along …. in particular, we need to seperate the in game scores and the final score and re-merge them for our model

# NAs are introduced because the raw data has invalid data points, so remove these observations
rtscoresAndFinalDF <- na.omit(nba_scores_DF2)
dim(rtscoresAndFinalDF) #16746     9
## [1] 16746     9
head(rtscoresAndFinalDF)
##     dateOrig       ts     teamlonga scorea    teamlongb scoreb   timestring timeleft    gameid
## 1 2016-04-05 15:06:16       Phoenix      0      Atlanta      0 (8:00 PM ET)       48 400829044
## 2 2016-04-05 15:06:16       Chicago      0      Memphis      0 (8:00 PM ET)       48 400829045
## 3 2016-04-05 15:06:16     Cleveland      0    Milwaukee      0 (8:00 PM ET)       48 400829046
## 4 2016-04-05 15:06:16 Oklahoma City      0       Denver      0 (9:00 PM ET)       48 400829047
## 5 2016-04-05 15:06:16   New Orleans      0 Philadelphia      0 (7:00 PM ET)       48 400829041
## 6 2016-04-05 15:06:16       Detroit      0        Miami      0 (8:00 PM ET)       48 400829042
head(filter(rtscoresAndFinalDF, grepl("FINAL", timestring)))
##     dateOrig       ts   teamlonga scorea    teamlongb scoreb timestring timeleft    gameid
## 1 2016-04-05 21:22:09 New Orleans     93 Philadelphia    107    (FINAL)        0 400829041
## 2 2016-04-05 22:08:42   Charlotte     90      Toronto     96    (FINAL)        0 400829043
## 3 2016-04-05 22:25:25     Chicago     92      Memphis    108    (FINAL)        0 400829045
## 4 2016-04-05 22:28:58     Phoenix     90      Atlanta    103    (FINAL)        0 400829044
## 5 2016-04-05 22:30:29   Cleveland    109    Milwaukee     80    (FINAL)        0 400829046
## 6 2016-04-05 22:30:29     Detroit     89        Miami    107    (FINAL)        0 400829042
head(filter(rtscoresAndFinalDF, grepl("1ST", timestring)))
##     dateOrig       ts   teamlonga scorea    teamlongb scoreb    timestring timeleft    gameid
## 1 2016-04-05 19:23:42 New Orleans     23 Philadelphia     12 (4:39 IN 1ST) 40.65000 400829041
## 2 2016-04-05 19:23:57 New Orleans     23 Philadelphia     14 (4:05 IN 1ST) 40.08333 400829041
## 3 2016-04-05 19:24:13 New Orleans     23 Philadelphia     14 (3:41 IN 1ST) 39.68333 400829041
## 4 2016-04-05 19:24:28 New Orleans     23 Philadelphia     14 (3:32 IN 1ST) 39.53333 400829041
## 5 2016-04-05 19:24:43 New Orleans     23 Philadelphia     16 (3:24 IN 1ST) 39.40000 400829041
## 6 2016-04-05 19:25:29 New Orleans     23 Philadelphia     16 (3:11 IN 1ST) 39.18333 400829041

UDFs for creating extra columns in real time data frame

These were a couple custom UDF’s I needed to cleanse the data and also to add a few features based on a proprietary way of combining the score with the time left.

# Function to turn long team name to short
teamMap <- function(x) {
  tnames <- data.frame(
    long = as.factor(c("Atlanta", "Boston", "Brooklyn", "Charlotte", "Chicago", 
                       "Cleveland", "Dallas", "Denver", "Detroit", "Golden State", 
                       "Houston","Indiana", "LA Clippers", "LA Lakers", "Memphis", 
                       "Miami", "Milwaukee", "Minnesota", "New Orleans", "New York",
                       "Oklahoma City", "Orlando", "Philadelphia", "Phila.", "Phoenix",
                       "Portland",  "Sacramento", "San Antonio", "Toronto", "Utah", "Washington")),
    short = as.factor(c("atl", "bos", "bkn", "cha", "chi",
                        "cle", "dal", "den", "det", "gst",
                        "hou", "ind", "lac", "lal", "mem",
                        "mia", "mil", "min", "nor", "nyk",
                        "okc", "orl", "phi", "phi", "pho",
                        "por", "sac", "san", "tor", "uta", "wsh"))
  )
  df_x <- data.frame(long=x)
  short <- tnames$short[match(df_x$long, tnames$long)]
  return(short)
  
}

# Function to convert 3-character month to 2-digit numeric month
monthMap <-function(x) {
  a <-data.frame(
    str = as.factor(c("Jan", "Feb", "Mar", "Apr", "May", 
                      "Jun", "Jul", "Aug", "Sep", "Oct", 
                      "Nov", "Dec")),
    num = as.factor(c("01", "02", "03", "04", "05",
                      "06", "07", "08", "09", "10",
                      "11", "12"))
  )
  df_x <- data.frame(str=x)
  num <- a$num[match(df_x$str, a$str)]
  return(num) 
}

# Date Logic to adjust for games that finish on the day after ....
# This is due to not having a great key to join my tables ...
dateadjustudf <- function(datein, tsin){
                   newdate <- c()
                   for (i in 1:length(tsin)){
                      if (grepl("^0[0-3]", tsin[i])) {
                          newdate[i] = datein[i] - 1
                      } else {
                          newdate[i] = datein[i]
                      }
                    }
                   return(newdate)
                  }

# UDFs to create some extra features ... this one is for an experiemental combination of Time left and Score difference.  
# Made this via intuition.  This can be extended to add other custom features
# val crossOverTime = 8
# val exponentScaler = 0.5
# There is no need to create UDFs here

Preproces the Real Time and Final Score Data . Add some useful columns to the data set

Here I create some extra columns for later use.

# Remove Overtime games from this analysis
rtscoresAndFinalDF <- filter(rtscoresAndFinalDF, !grepl(".*OT.*", timestring))
#16626

# Create short 3 character team names
rtscoresAndFinalDF$teama <- teamMap(rtscoresAndFinalDF$teamlonga)
rtscoresAndFinalDF$teamb <- teamMap(rtscoresAndFinalDF$teamlongb)

# Add a score differential Column 
rtscoresAndFinalDF$scorea_scoreb <- rtscoresAndFinalDF$scorea - rtscoresAndFinalDF$scoreb

# Transform the Date.  This is for games that spanned multiple days and gave me a headache.  
# Games adjusted to the day they started on.
rtscoresAndFinalDF$date <-  dateadjustudf(rtscoresAndFinalDF$dateOrig, rtscoresAndFinalDF$ts)
rtscoresAndFinalDF$date <- as.Date(rtscoresAndFinalDF$date, origin = "1970-01-01")

# Create a Key for me to use to join with odds data later.  Key = date.teama.teamb
for (i in 1:nrow(rtscoresAndFinalDF)){
  rtscoresAndFinalDF$key[i] <- paste0(rtscoresAndFinalDF$date[i], ".", rtscoresAndFinalDF$teama[i], ".", rtscoresAndFinalDF$teamb[i])
}

Separate The Real Time And Final Data From One Common Dataframe To Two Dataframes

Currently based on the way the data was sampled, both real time scores and final scores are written as seperate records to the same file. I need to pull these apart, and then join the dataframes so that I have a real time score and features and know if the game was won or lost ….

# Create Final Score DF
# Note a shortcut for repeating the dataframe within the filter is to use a $df.filter(df("foo").contains ... is equiv to df.filter($"foo".contains)
finalscoresDF <- filter(rtscoresAndFinalDF, grepl("FINAL", timestring))

# Rename some columns so that join later doesnt have name overlaps
finalscoresDF$fscorea <- finalscoresDF$scorea
finalscoresDF$fscoreb <- finalscoresDF$scoreb

# Create final score difference
finalscoresDF$fscorea_fscoreb <- finalscoresDF$fscorea - finalscoresDF$fscoreb
finalscoresDF$fscoreb_fscorea <- finalscoresDF$fscoreb - finalscoresDF$fscorea


# Add a Win/loss column Win = 1, Loss = 0
for (i in 1 : nrow(finalscoresDF)){
  if (finalscoresDF$fscorea_fscoreb[i] > 0){
    finalscoresDF$home_win[i] <- 0
    finalscoresDF$away_win[i] <- 1
  } else {
    finalscoresDF$home_win[i] <- 1
    finalscoresDF$away_win[i] <- 0
  }
}


#################################################################################################################
# Create Real time score DF and more wrangling

# Remove Halftime records and these other cases as my datasource doesnt always change the quarter well
# as this particular case isn't handled well... (for now)
rtscoresDF <- filter(rtscoresAndFinalDF, !grepl('HALF', timestring), !grepl('FINAL', timestring),
                   timestring != "(12:00 IN 1ST)" ,
                   timestring != "(12:00 IN 2ND)" , 
                   timestring != "(12:00 IN 3RD)" ,
                   timestring != "(12:00 IN 4TH)" ,  
                   timestring != "(END OF 1ST)" ,
                   timestring != "(END OF 2ND)" , 
                   timestring != "(END OF 3RD)" ,
                   timestring != "(END OF 4TH)" )


# Create real time score difference
rtscoresDF$scorea_scoreb <-  rtscoresDF$scorea - rtscoresDF$scoreb
rtscoresDF$scoreb_scorea <-  rtscoresDF$scoreb - rtscoresDF$scorea


# Create a game PCT complete and PCT left indictor
rtscoresDF$pct_complete <- (((rtscoresDF$timeleft * -1) + 48 )/48.0)*100
rtscoresDF$pct_left <- 100 - rtscoresDF$pct_complete

# Create a unique feature. Idea here is that I have intuition that timeleft and score difference are a strong predictor when combined
rtscoresDF$cf1 <- (1/((rtscoresDF$pct_left/25 + .01)^.5)) * rtscoresDF$scoreb_scorea
rtscoresDF$cf2 <- (1/((rtscoresDF$pct_left/2.0 + .01)^1.3))*rtscoresDF$scoreb_scorea

Custom Feature Explanation

After building my initial model, I noticed that the logistic model was adjusting the probabilities well at the end of the games. I had some examples where I had 0 time left in the game, and yet the logistic model was giving a 70% chance of victory for a team. I speculated this was due to the fact that my original features were not fitting the end of game very well. To fix this, I created a spreader custom feature that basically takes the score difference and amplifies it as the score nears the end of the game. This way this feature is very predictive at the end of games and can help adjust the probablities to be more certain at the end of games.

Show effect of custom spreader feature

# subset a dataframe for scatterplot
spreader <- filter(rtscoresDF, pct_complete < 95)

# draw interactive scatter plot
scatterD3(x = spreader$pct_complete, y = spreader$scoreb_scorea, col_var = spreader$key)

Inspect Custom features …

scatterD3(x = spreader$pct_complete, y = spreader$cf2, col_var = spreader$key)

***

Lets Take A Look Of What We Have For The Two Dataframes We Just Wrangled

## [1] "final scores data frame"
##     dateOrig       ts   teamlonga scorea    teamlongb scoreb timestring timeleft    gameid teama teamb scorea_scoreb       date                key fscorea fscoreb fscorea_fscoreb fscoreb_fscorea home_win away_win
## 1 2016-04-05 21:22:09 New Orleans     93 Philadelphia    107    (FINAL)        0 400829041   nor   phi           -14 2016-04-05 2016-04-05.nor.phi      93     107             -14              14        1        0
## 2 2016-04-05 22:08:42   Charlotte     90      Toronto     96    (FINAL)        0 400829043   cha   tor            -6 2016-04-05 2016-04-05.cha.tor      90      96              -6               6        1        0
## 3 2016-04-05 22:25:25     Chicago     92      Memphis    108    (FINAL)        0 400829045   chi   mem           -16 2016-04-05 2016-04-05.chi.mem      92     108             -16              16        1        0
## 4 2016-04-05 22:28:58     Phoenix     90      Atlanta    103    (FINAL)        0 400829044   pho   atl           -13 2016-04-05 2016-04-05.pho.atl      90     103             -13              13        1        0
## 5 2016-04-05 22:30:29   Cleveland    109    Milwaukee     80    (FINAL)        0 400829046   cle   mil            29 2016-04-05 2016-04-05.cle.mil     109      80              29             -29        0        1
## 6 2016-04-05 22:30:29     Detroit     89        Miami    107    (FINAL)        0 400829042   det   mia           -18 2016-04-05 2016-04-05.det.mia      89     107             -18              18        1        0
## [1] "Total Games = 116"
## [1] "real time scores data frame"
##     dateOrig       ts     teamlonga scorea    teamlongb scoreb   timestring timeleft    gameid teama teamb scorea_scoreb       date                key scoreb_scorea pct_complete pct_left cf1 cf2
## 1 2016-04-05 15:06:16       Phoenix      0      Atlanta      0 (8:00 PM ET)       48 400829044   pho   atl             0 2016-04-05 2016-04-05.pho.atl             0            0      100   0   0
## 2 2016-04-05 15:06:16       Chicago      0      Memphis      0 (8:00 PM ET)       48 400829045   chi   mem             0 2016-04-05 2016-04-05.chi.mem             0            0      100   0   0
## 3 2016-04-05 15:06:16     Cleveland      0    Milwaukee      0 (8:00 PM ET)       48 400829046   cle   mil             0 2016-04-05 2016-04-05.cle.mil             0            0      100   0   0
## 4 2016-04-05 15:06:16 Oklahoma City      0       Denver      0 (9:00 PM ET)       48 400829047   okc   den             0 2016-04-05 2016-04-05.okc.den             0            0      100   0   0
## 5 2016-04-05 15:06:16   New Orleans      0 Philadelphia      0 (7:00 PM ET)       48 400829041   nor   phi             0 2016-04-05 2016-04-05.nor.phi             0            0      100   0   0
## 6 2016-04-05 15:06:16       Detroit      0        Miami      0 (8:00 PM ET)       48 400829042   det   mia             0 2016-04-05 2016-04-05.det.mia             0            0      100   0   0
## [1] "Total Number of rt score records = 15947"

Inspect Odds Data

How to Read the Raw Odds data

Example Golden State -12.5 O (207.0) -125.0 | Detroit 12.5 U (207.0) 145.0
The away team is listed first, and the home team is second
Here Golden State is a 12.5 pt favorite to win.  The over under is in parentheses (207) and is the 50/50 line between teams sum of scores
being above/below that line.  
Finally the -125 / +145 numbers are whats known at the moneyline odds. 
    A negative number means you need to bet 125$ to get a 100$ payout
    A positive number means you need to bet 100$ to get a 145$ payout

load in odds data

xml <- readLines(textConnection(getURL("https://raw.githubusercontent.com/dustinvanstee/nba-rt-prediction/master/nbaodds_042516.xml")))

# use regular expression to catch info we need
odds <- lapply(xml, function(x) substr(x, regexpr(">", x) + 1, regexpr("/", x) - 2))
odds_split <- lapply(odds, function(x) unlist(strsplit(x, " ")))

# get teamlonga
teamlonga_0 <- lapply(odds_split, function(x) paste(x[1], x[2]))
teamlonga <- lapply(teamlonga_0, function(x){
  if (regexpr("[0-9|-]", x) > -1) {
    substr(x, 1, regexpr("[0-9|-]", x)-2) 
  } else{
    x 
  }
})

# get teamlongb
teamlongb_0 <- lapply(odds_split, function(x) paste(x[7],x[8], x[9]))
teamlongb_1 <- lapply(teamlongb_0, function(x){
  if (regexpr("[0-9]", x) > -1) {
    substr(x, regexpr("[A-Za-z]", x), regexpr("[0-9-]", x)-2) 
  } else{
    x 
  }
})

teamlongb <- lapply(teamlongb_1, function(x){
  if (regexpr("|", x) > -1){
    substr(x, regexpr("[A-Za-z]", x), nchar(x))
  } else {
    x
  }
})

# teamaspread
teamaspread_0 <- lapply(odds, function(x){
  substr(x, regexpr("[0-9-]",x), regexpr("[0-9-]",x)+4)
})

teamaspread <- lapply(teamaspread_0, function(x){
  if (regexpr("[ ]", x) > 0){
    substr(x, 1, regexpr("[ ]", x)-1)
  } else {
    x
  }
})

# overunder
overunder <- lapply(odds, function(x){
  substr(x, regexpr("[(]", x) + 1, regexpr("[)]", x) - 1)
})

# teamaml
teamaml <- lapply(odds, function(x){
  substr(x,regexpr("[)]", x) + 2, regexpr("[|]", x) - 2 )
})

# teambml
teambml <- lapply(odds, function(x){
  substr(x, gregexpr("[)]", x)[[1]][2]+2, gregexpr("[(]", x)[[1]][3]-2)
})


#get date
dateStr <- lapply(odds, function(x){
  month <- substr(x, gregexpr("[(]", x)[[1]][3]+1, gregexpr("[(]", x)[[1]][3]+3)
  month_num <- monthMap(month)
  date <- substr(x, gregexpr("[(]", x)[[1]][3]+5, gregexpr("[(]", x)[[1]][3]+6)
  year <- substr(x, gregexpr("[(]", x)[[1]][3]+9, gregexpr("[(]", x)[[1]][3]+12)
  paste0(year, "-", month_num, "-", date)
})

# get short team names
teama <- lapply(teamlonga, teamMap)
teamb <- lapply(teamlongb, teamMap)

# bind all column together into dataframe

oddsDF <- na.omit(do.call(rbind, Map(data.frame, teamlonga=teamlonga, teama=teama, teamlongb=teamlongb, teamb=teamb, teamaspread=teamaspread, overunder=overunder, teamaml=teamaml, teambml=teambml, dateStr=dateStr)))

# change to right data type and create a key for join later
oddsDF$teamaspread <- as.numeric(as.character(oddsDF$teamaspread))
oddsDF$overunder <- as.numeric(as.character(oddsDF$overunder))
oddsDF$teamaml <- as.numeric(as.character(oddsDF$teamaml))
oddsDF$teambml <- as.numeric(as.character(oddsDF$teambml))

oddsDF$teama <- as.character(oddsDF$teama)
oddsDF$teamb <- as.character(oddsDF$teamb)
oddsDF$key <- paste0(oddsDF$dateStr, ".", oddsDF$teama, ".", oddsDF$teamb)
dim(oddsDF) #161  10
## [1] 161  10
# add the groupby and average below because I was getting the game odds over multiple days, and it was adding noise to the analysis

oddsDF2 <- ddply(oddsDF, c("key", "teamlonga", "teamlongb", "teama", "teamb", "dateStr"), summarise,
               teamaspread = mean(teamaspread),
               overunder = mean(overunder),
               teamaml = mean(teamaml),
               teambml = mean(teambml))

# Create a few new columns for later analysis

oddsDF2$teambspread <- oddsDF2$teamaspread * -1

oddsDF2$teama_vegas_fscore <- (oddsDF2$overunder / 2.0) - (oddsDF2$teamaspread / 2.0)

oddsDF2$teamb_vegas_fscore <- (oddsDF2$overunder / 2.0) + (oddsDF2$teamaspread / 2.0)

Inspect some of the Odds Data

head(oddsDF2)
##                  key teamlonga    teamlongb teama teamb    dateStr teamaspread overunder teamaml teambml teambspread teama_vegas_fscore teamb_vegas_fscore
## 1 2016-04-05.cha.tor Charlotte      Toronto   cha   tor 2016-04-05         4.0     200.5     155    -175        -4.0              98.25             102.25
## 2 2016-04-05.chi.mem   Chicago      Memphis   chi   mem 2016-04-05        -3.0     201.5    -150     130         3.0             102.25              99.25
## 3 2016-04-05.cle.mil Cleveland    Milwaukee   cle   mil 2016-04-05        -7.5     203.0    -340     280         7.5             105.25              97.75
## 4 2016-04-05.det.mia   Detroit        Miami   det   mia 2016-04-05         4.0     202.0     160    -190        -4.0              99.00             103.00
## 5 2016-04-05.lal.lac LA Lakers  LA Clippers   lal   lac 2016-04-05        14.5     208.0    -110    -110       -14.5              96.75             111.25
## 6 2016-04-05.min.gst Minnesota Golden State   min   gst 2016-04-05        15.5     225.0    -110    -110       -15.5             104.75             120.25
paste("total home teams = ", length(unique(oddsDF2$teama)))
## [1] "total home teams =  30"
paste("total away teams = ", length(unique(oddsDF2$teamb)))
## [1] "total away teams =  30"
paste("total games collected = ", nrow(oddsDF2))
## [1] "total games collected =  111"

Avg Team Away Game Spread - ( hint < 0 means favorite)

Here we are averaging the away spread per team. If the bar is above the zero line, then the team is an underdog, and under the line the team is the favorite. 8 of the 32 teams were favorites on the road… and they are the likely suspect including CLE/GST/OKC

# visualize away spread data
avg_away_spread <- ddply(oddsDF2, c("teamlonga", "teamlongb"), summarise,
                         awayspread_avg_teamaspread = mean(teamaspread),
                         awayspread_avg_teambspread = mean(teambspread))

# away spread group by teama
away_spread_teama <- ddply(avg_away_spread, c("teamlonga"), summarise,
                           teamaspread = mean(awayspread_avg_teamaspread))
# order by teama
away_spread_teama$teamlonga <- as.character(away_spread_teama$teamlonga)
away_spread_teama <- away_spread_teama[order(away_spread_teama$teamlonga), ]

# barchart
p <- plot_ly(
  x = away_spread_teama$teamlonga,
  y = away_spread_teama$teamaspread,
  type = "bar")
p

Avg Home Team Game Spread - (Hint > 0 means underdog)

Here we are averaging the home spread per team. If the bar is above the zero line, then the team is an underdog, and under the line the team is the favorite. Note here that the home teams are favored much more, with the usual suspects having a very large advantage (SAN/GST/OKC)

# spread group by teamb
away_spread_teamb <- ddply(avg_away_spread, c("teamlongb"), summarise,
                           teambspread = mean(awayspread_avg_teambspread))
# order by teamb
away_spread_teamb$teamlongb <- as.character(away_spread_teamb$teamlongb)
away_spread_teamb <- away_spread_teamb[order(away_spread_teamb$teamlongb), ]


p <- plot_ly(
        x = away_spread_teamb$teamlongb,
        y = away_spread_teamb$teambspread,
        type = "bar")

    
p

Join odds and final scores data.

# Here is where we join the Odds/Realtime scores/ Final Scores into one wholistic data set as input for Logistic Machine Learning

# Create a smaller Final Score Dataframe.  Just keep the key, final score a and b, the win/loss indicator
finalslicedscoresDF <- finalscoresDF[c("key","fscorea", "fscoreb", "fscorea_fscoreb", "fscoreb_fscorea", "away_win", "home_win")]

# First Join the 2 smallest data frames ... odd and final.
gameDF <- merge(finalslicedscoresDF, oddsDF2, by = "key")
gameDF$teamlonga <- NULL
gameDF$teamlongb <- NULL
gameDF$teama <- NULL
gameDF$teamb <- NULL

# Print Out the Game Dataframe ... notice we have the odds data merged with the win loss data ....
print("gameDF")
## [1] "gameDF"
head(gameDF)
##                  key fscorea fscoreb fscorea_fscoreb fscoreb_fscorea away_win home_win    dateStr teamaspread overunder teamaml teambml teambspread teama_vegas_fscore teamb_vegas_fscore
## 1 2016-04-05.cha.tor      90      96              -6               6        0        1 2016-04-05         4.0     200.5     155    -175        -4.0              98.25             102.25
## 2 2016-04-05.chi.mem      92     108             -16              16        0        1 2016-04-05        -3.0     201.5    -150     130         3.0             102.25              99.25
## 3 2016-04-05.cle.mil     109      80              29             -29        1        0 2016-04-05        -7.5     203.0    -340     280         7.5             105.25              97.75
## 4 2016-04-05.det.mia      89     107             -18              18        0        1 2016-04-05         4.0     202.0     160    -190        -4.0              99.00             103.00
## 5 2016-04-05.lal.lac      81     103             -22              22        0        1 2016-04-05        14.5     208.0    -110    -110       -14.5              96.75             111.25
## 6 2016-04-05.nor.phi      93     107             -14              14        0        1 2016-04-05         2.5     207.0     125    -145        -2.5             102.25             104.75
paste("total games collected:", nrow(gameDF)) #103
## [1] "total games collected: 103"

Lets see if there are some correlations … Spread vs Final Score Difference

# Here we show that the better a team is (negative spread, the more they are likely to win ...)

#Here the spread at the start of the game is a decent predictor regarding the end result

# Final Score Difference vs Spread  
# Top Left indicates teams with a large pos spread will lose by a wider margin
# the line should approx pass through 0,0
# lower Right indicates teams with large neg spread will win by a wider margin 

# The logistic and linear models we build will quantify this for us later!

scatterD3(x = gameDF$fscoreb_fscorea, y = gameDF$teamaspread)

Home / Away sensitivity to Point Spread

# Here we can show another weak correlation of the vegas overunder/spread to the actual final outcome.
# vegas_fscore was calculated by taking overunder/2 +- the spread/2 to get a projection of
# the home/away teams score
# Here if the prediction and data were perfectly correlated, we would pass through the
# y=x line.  in general we follow that path
# we will see how this term plays when we dig into the linear model
# here only home team is shown, but same trend holds for away team

scatterD3(x = gameDF$teamb_vegas_fscore, y = gameDF$fscoreb)

Join The Game Dataframe With The Real Time Score Dataframe

# This is the bigger merge.  Merging the odds/final score data with the real time indicators ...
lrDF <- merge(gameDF, rtscoresDF, by = "key")
print("lrDF : Logistic Regression Data Frame")
## [1] "lrDF : Logistic Regression Data Frame"
head(lrDF)
##                  key fscorea fscoreb fscorea_fscoreb fscoreb_fscorea away_win home_win    dateStr teamaspread overunder teamaml teambml teambspread teama_vegas_fscore teamb_vegas_fscore   dateOrig       ts teamlonga scorea teamlongb scoreb     timestring timeleft    gameid teama teamb scorea_scoreb       date scoreb_scorea pct_complete pct_left       cf1        cf2
## 1 2016-04-05.cha.tor      90      96              -6               6        0        1 2016-04-05           4     200.5     155    -175          -4              98.25             102.25 2016-04-05 20:32:49 Charlotte     42   Toronto     52  (1:12 IN 2ND) 25.20000 400829043   cha   tor           -10 2016-04-05            10    47.500000 52.50000  6.884284 0.14286204
## 2 2016-04-05.cha.tor      90      96              -6               6        0        1 2016-04-05           4     200.5     155    -175          -4              98.25             102.25 2016-04-05 20:56:38 Charlotte     44   Toronto     58  (8:56 IN 3RD) 20.93333 400829043   cha   tor           -14 2016-04-05            14    56.388889 43.61111 10.569592 0.25452584
## 3 2016-04-05.cha.tor      90      96              -6               6        0        1 2016-04-05           4     200.5     155    -175          -4              98.25             102.25 2016-04-05 20:08:41 Charlotte     19   Toronto     28 (11:08 IN 2ND) 35.13333 400829043   cha   tor            -9 2016-04-05             9    26.805556 73.19444  5.250891 0.08348447
## 4 2016-04-05.cha.tor      90      96              -6               6        0        1 2016-04-05           4     200.5     155    -175          -4              98.25             102.25 2016-04-05 20:26:10 Charlotte     35   Toronto     41  (4:11 IN 2ND) 28.18333 400829043   cha   tor            -6 2016-04-05             6    41.284722 58.71528  3.906817 0.07411763
## 5 2016-04-05.cha.tor      90      96              -6               6        0        1 2016-04-05           4     200.5     155    -175          -4              98.25             102.25 2016-04-05 20:32:34 Charlotte     40   Toronto     52  (1:30 IN 2ND) 25.50000 400829043   cha   tor           -12 2016-04-05            12    46.875000 53.12500  8.212631 0.16881814
## 6 2016-04-05.cha.tor      90      96              -6               6        0        1 2016-04-05           4     200.5     155    -175          -4              98.25             102.25 2016-04-05 19:49:35 Charlotte      9   Toronto     12  (7:18 IN 1ST) 43.30000 400829043   cha   tor            -3 2016-04-05             3     9.791667 90.20833  1.577128 0.02120872
paste("total data points collected:", nrow(lrDF)) #13412
## [1] "total data points collected: 13412"

Add a Few More Features

# Add an overunder/spread adjusted projection as points are scored during the game
# I found this is a strong indicator
lrDF$teama_adj_fscore <- ((lrDF$pct_complete  * -1)/100 + 1) * lrDF$teama_vegas_fscore + lrDF$scorea
lrDF$teamb_adj_fscore <- ((lrDF$pct_complete  * -1)/100 + 1) * lrDF$teamb_vegas_fscore + lrDF$scoreb
lrDF$pfscoreb_pfscorea <- lrDF$teamb_adj_fscore - lrDF$teama_adj_fscore

Filter Out some Data due to data quality

# There is an issue with the data I had captured.  When a quarter transitions from 1st->2nd (etc,etc), sometime the timestring doesn't get updated properly.  Since I used the timestring to calculate the timeleft in the game, I would get some rogue data points.  
# Example, after 1 min in a game, something the two teams would have scores in the 20's, because it was really at 11 mins in the second quarter.  
# My solution was to use the final score sum, and then just scale that down to the time left in the game.  I would then compare to the sum of scores i had, and if it was significantly higher, I would remove them.  I did this by visual inspection ... 
# dfa = departure_from_avg

lrDF$dfa <- (lrDF$fscorea + lrDF$fscoreb)/48 * (lrDF$timeleft * -1 + 48) - (lrDF$scorea + lrDF$scoreb)
lrDF_filtered <- filter(lrDF, dfa > -30)

Lets Look at some stats from logistic Regression dataframe

summary(lrDF_filtered)
##      key               fscorea          fscoreb      fscorea_fscoreb  fscoreb_fscorea     away_win        home_win           dateStr      teamaspread        overunder        teamaml           teambml        teambspread      teama_vegas_fscore teamb_vegas_fscore    dateOrig               ts             teamlonga             scorea        teamlongb             scoreb        timestring           timeleft        gameid              teama          teamb      scorea_scoreb          date            scoreb_scorea      pct_complete       pct_left           cf1                cf2             teama_adj_fscore teamb_adj_fscore pfscoreb_pfscorea      dfa          
##  Length:13217       Min.   : 68.00   Min.   : 80.0   Min.   :-38.00   Min.   :-29.00   Min.   :0.000   Min.   :0.000   2016-04-13:1829   Min.   :-13.000   Min.   :180.5   Min.   :-553.33   Min.   :-750.0   Min.   :-19.000   Min.   : 84.38     Min.   : 84.92     Min.   :2016-04-05   Length:13217       Length:13217       Min.   :  0.00   Length:13217       Min.   :  0.00   Length:13217       Min.   : 0.00   Length:13217       cha    : 796   bos    : 889   Min.   :-44.000   Min.   :2016-04-05   Min.   :-33.000   Min.   :  0.00   Min.   :  0.00   Min.   :-290.000   Min.   :-11545.108   Min.   : 66.42   Min.   : 75.50   Min.   :-34.915   Min.   :-26.7865  
##  Class :character   1st Qu.: 92.00   1st Qu.: 97.0   1st Qu.:-15.00   1st Qu.: -5.00   1st Qu.:0.000   1st Qu.:0.000   2016-04-05:1452   1st Qu.: -3.500   1st Qu.:200.3   1st Qu.:-150.00   1st Qu.:-230.0   1st Qu.: -9.500   1st Qu.: 97.17     1st Qu.:100.50     1st Qu.:2016-04-08   Class :character   Class :character   1st Qu.: 28.00   Class :character   1st Qu.: 29.00   Class :character   1st Qu.:10.07   Class :character   san    : 716   mia    : 804   1st Qu.:-11.000   1st Qu.:2016-04-08   1st Qu.: -3.000   1st Qu.: 28.65   1st Qu.: 20.97   1st Qu.:  -2.362   1st Qu.:    -0.044   1st Qu.: 93.50   1st Qu.: 97.55   1st Qu.: -3.841   1st Qu.: -3.7944  
##  Mode  :character   Median : 99.00   Median :105.0   Median : -8.00   Median :  8.00   Median :0.000   Median :1.000   2016-04-08:1448   Median :  4.833   Median :205.5   Median :   7.50   Median :-127.5   Median : -4.833   Median :100.25     Median :104.50     Median :2016-04-11   Mode  :character   Mode  :character   Median : 53.00   Mode  :character   Median : 56.00   Mode  :character   Median :22.73   Mode  :character   okc    : 671   dal    : 687   Median : -3.000   Median :2016-04-11   Median :  3.000   Median : 52.64   Median : 47.36   Median :   1.821   Median :     0.033   Median : 99.58   Median :104.41   Median :  5.383   Median :  0.6389  
##                     Mean   : 99.71   Mean   :105.3   Mean   : -5.59   Mean   :  5.59   Mean   :0.353   Mean   :0.647   2016-04-11:1233   Mean   :  3.518   Mean   :204.8   Mean   :  26.41   Mean   :-122.4   Mean   : -3.518   Mean   :100.66     Mean   :104.18     Mean   :2016-04-11                                         Mean   : 52.85                      Mean   : 56.14                      Mean   :22.40                      cle    : 639   tor    : 639   Mean   : -3.293   Mean   :2016-04-11   Mean   :  3.293   Mean   : 53.32   Mean   : 46.68   Mean   :   4.799   Mean   :    31.508   Mean   : 99.85   Mean   :104.79   Mean   :  4.938   Mean   :  0.4491  
##                     3rd Qu.:107.00   3rd Qu.:113.0   3rd Qu.:  5.00   3rd Qu.: 15.00   3rd Qu.:1.000   3rd Qu.:1.000   2016-04-10:1168   3rd Qu.:  9.500   3rd Qu.:209.5   3rd Qu.: 190.00   3rd Qu.: 115.0   3rd Qu.:  3.500   3rd Qu.:103.50     3rd Qu.:108.75     3rd Qu.:2016-04-14                                         3rd Qu.: 77.00                      3rd Qu.: 82.00                      3rd Qu.:34.25                      tor    : 578   hou    : 593   3rd Qu.:  3.000   3rd Qu.:2016-04-13   3rd Qu.: 11.000   3rd Qu.: 79.03   3rd Qu.: 71.35   3rd Qu.:   9.017   3rd Qu.:     0.244   3rd Qu.:106.12   3rd Qu.:111.73   3rd Qu.: 14.159   3rd Qu.:  5.0549  
##                     Max.   :131.00   Max.   :144.0   Max.   : 29.00   Max.   : 38.00   Max.   :1.000   Max.   :1.000   2016-04-06:1108   Max.   : 19.000   Max.   :225.2   Max.   : 541.67   Max.   : 410.0   Max.   : 13.000   Max.   :115.75     Max.   :119.12     Max.   :2016-04-24                                         Max.   :131.00                      Max.   :144.00                      Max.   :48.00                      mem    : 577   ind    : 586   Max.   : 33.000   Max.   :2016-04-24   Max.   : 44.000   Max.   :100.00   Max.   :100.00   Max.   : 380.000   Max.   : 15128.072   Max.   :135.35   Max.   :146.36   Max.   : 47.362   Max.   : 21.3115  
##                                                                                                                        (Other)   :4979                                                                                                                                                                                                                                                                                                      (Other):9240   (Other):9019

Visualize some of our Time Series Data. …

# here we can see the trajectory of some of the games .....    
# upper left beginning ... upper right (win), lower right (loss)
# cool visual .... gives an idea about how the games flow
tsplot <- filter(lrDF_filtered, grepl("cle", key) | grepl("gst", key))

scatterD3(x = tsplot$pct_complete, y = tsplot$scoreb_scorea, col_var = tsplot$key)

Samples per Game Visualization - Data Quality check

DQ_check <- ddply(lrDF_filtered, c("key"), summarise,
                      N = length(key))
# order by N
DQ_check <- DQ_check[order(DQ_check$N),]

# plot
p <- plot_ly(
  x = DQ_check$Key,
  y = DQ_check$N,
  type = "bar")
p

Save Out Dataframe For Further Analysis with Logistic and Linear Regression Notebooks

# Wanted to save out the dataset at this point as I will branch into seperate work efforts for a Logistic/Linear model building
# drop some columns as we move on to next step !!

lrDF_final <- lrDF_filtered
lrDF_final$dateOrig <- NULL
lrDF_final$ts <- NULL
lrDF_final$teamlonga <- NULL
lrDF_final$teamlongb <- NULL
lrDF_final$timestring <- NULL
lrDF_final$gameid <- NULL
lrDF_final$teamaml <- NULL 
lrDF_final$teambml <- NULL
lrDF_final$dfa <- NULL
lrDF_final$teama <- NULL
lrDF_final$teamb <- NULL
lrDF_final$dateStr <- NULL
names(lrDF_final)
##  [1] "key"                "fscorea"            "fscoreb"            "fscorea_fscoreb"    "fscoreb_fscorea"    "away_win"           "home_win"           "teamaspread"        "overunder"          "teambspread"        "teama_vegas_fscore" "teamb_vegas_fscore" "scorea"             "scoreb"             "timeleft"           "scorea_scoreb"      "date"               "scoreb_scorea"      "pct_complete"       "pct_left"           "cf1"                "cf2"                "teama_adj_fscore"   "teamb_adj_fscore"   "pfscoreb_pfscorea"
head(lrDF_final)
##                  key fscorea fscoreb fscorea_fscoreb fscoreb_fscorea away_win home_win teamaspread overunder teambspread teama_vegas_fscore teamb_vegas_fscore scorea scoreb timeleft scorea_scoreb       date scoreb_scorea pct_complete pct_left       cf1        cf2 teama_adj_fscore teamb_adj_fscore pfscoreb_pfscorea
## 1 2016-04-05.cha.tor      90      96              -6               6        0        1           4     200.5          -4              98.25             102.25     42     52 25.20000           -10 2016-04-05            10    47.500000 52.50000  6.884284 0.14286204         93.58125         105.6813         12.100000
## 2 2016-04-05.cha.tor      90      96              -6               6        0        1           4     200.5          -4              98.25             102.25     44     58 20.93333           -14 2016-04-05            14    56.388889 43.61111 10.569592 0.25452584         86.84792         102.5924         15.744444
## 3 2016-04-05.cha.tor      90      96              -6               6        0        1           4     200.5          -4              98.25             102.25     19     28 35.13333            -9 2016-04-05             9    26.805556 73.19444  5.250891 0.08348447         90.91354         102.8413         11.927778
## 4 2016-04-05.cha.tor      90      96              -6               6        0        1           4     200.5          -4              98.25             102.25     35     41 28.18333            -6 2016-04-05             6    41.284722 58.71528  3.906817 0.07411763         92.68776         101.0364          8.348611
## 5 2016-04-05.cha.tor      90      96              -6               6        0        1           4     200.5          -4              98.25             102.25     40     52 25.50000           -12 2016-04-05            12    46.875000 53.12500  8.212631 0.16881814         92.19531         106.3203         14.125000
## 6 2016-04-05.cha.tor      90      96              -6               6        0        1           4     200.5          -4              98.25             102.25      9     12 43.30000            -3 2016-04-05             3     9.791667 90.20833  1.577128 0.02120872         97.62969         104.2380          6.608333
write.csv(lrDF_final, file = "nba-datawrangle-lrDF.csv")